For profiling I used TAU (https://tau.uoregon.edu/), since I wanted to try it out. TAU reports how much time was spent in each function, and I used that to extract only the time taken by each call to count_sort (or qsort_lib).

For statistical significance each timing should be repeated at least 10 times, with the mean and standard deviation taken into account. I had part of this analysis prepared, but not all simulations finished in time, so I fell back to reporting a single run per configuration.
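
A minimal sketch of what such a repeated-timing harness could look like, assuming wall-clock timing with omp_get_wtime instead of TAU and reusing the count_sort and init_vector functions from the listings below (time_count_sort and REPS are made up for the example):

#include <math.h>
#include <stdio.h>
#include <omp.h>                     /* omp_get_wtime() */

void count_sort(int a[], int n);     /* defined in the listings below */
int  init_vector(int vect[]);

#define REPS 10

void time_count_sort(int a[], int n) {
  double t[REPS], mean = 0.0, var = 0.0;

  for (int r = 0; r < REPS; r++) {
    init_vector(a);                  /* fresh unsorted input for every run */
    double start = omp_get_wtime();
    count_sort(a, n);
    t[r] = omp_get_wtime() - start;
    mean += t[r];
  }
  mean /= REPS;
  for (int r = 0; r < REPS; r++)
    var += (t[r] - mean) * (t[r] - mean);

  printf("n=%d: mean %.6f s, std %.6f s\n", n, mean, sqrt(var / (REPS - 1)));
}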

Python modules used in analysis:


In [1]:
%matplotlib inline
from IPython.core.display import HTML

from collections import defaultdict

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from analysis_scripts import prepare_panel

A

i-loop

In the i-loop case $i, j, \text{count}$ can be private.

$a$ and $\text{temp}$ should be shared: $a$ is read-only but can be big (so making it private would add substantial copying overhead), and $\text{temp}$ is written by all threads, but each iteration writes to a distinct position (the rank of $a[i]$), so there are no race conditions.

$n$ is effectively constant in this context, but it still needs to be shared (or firstprivate): if it were plain private, each thread would get its own uninitialized copy of the loop bound and the results would be wrong.


In [30]:
!pygmentize -l c -f html -O full -o ./iloop.html ../src/omp_iloop.c
HTML(open('iloop.html', 'r').read())


Out[30]:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifdef _OPENMP
#include <omp.h>
#endif

#ifndef VECTOR_SIZE
#define VECTOR_SIZE 100
#endif


void count_sort(int a[], int n) {
  int i, j, count;
  int *temp = malloc(n*sizeof(int));

  #pragma omp parallel for default(none) private(i, j, count) shared(a, temp, n)
  for (i = 0; i < n; i++) {
    count = 0;
    for (j = 0; j < n; j++) {
      if (a[j] < a[i])
        count++;
      else if (a[j] == a[i] && j < i)
        count++;
    }
    temp[count] = a[i];
  }

  #ifndef _OPENMP
  memcpy(a, temp, n*sizeof(int));
  #else

  #pragma omp parallel
  {
    int thread_count = omp_get_num_threads();
    int local_n = n / thread_count;

    #pragma omp parallel for default(none) private(i) shared(n, a, temp, local_n, thread_count)
    for (int i = 0; i < thread_count; i++) {
      int start = i * local_n;
      int nelem = local_n;
      if (i == thread_count - 1) {
        nelem = n - start;
      }
      memcpy(a + start, temp + start, nelem * sizeof(int));
    }
  }

  #endif
  free(temp);
} /* Count sort */


int init_vector(int vect[]) {
    for (int i = 0; i < VECTOR_SIZE; ++i ) {
      vect[i] = rand() % 100;
    }
    return 0;
}


int main(int argc, char** argv) {
  int vector[VECTOR_SIZE];

  init_vector(vector);

  count_sort(vector, VECTOR_SIZE);

  for(int i = 1; i < VECTOR_SIZE; i++) {
    if (vector[i] < vector[i - 1]) {
      return 1;
    }
  }

  return 0;
}

j-loop

In the j-loop case $j$ should be private. $\text{count}$ is updated by every thread, so it has to be shared and each update has to be synchronized (e.g. with an atomic increment); this also means a thread has to wait whenever another thread is updating $\text{count}$.

$a$ is read-only but can be big (and so would have substantial copying overhead if it were private), so I made it shared.

$i$ and $n$ are constant in this context, but they need to be shared; if they were private, each thread would get new, uninitialized copies and the loop would misbehave.
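
As a side note (not the version profiled below), a reduction would avoid the shared counter entirely: with reduction(+:count) each thread keeps a private partial count and OpenMP sums the partials when the loop ends, so no update has to wait on another thread. A sketch of such a variant of count_sort:

#include <stdlib.h>
#include <string.h>

/* Sketch only: j-loop parallelization with a reduction instead of a shared,
 * synchronized counter. Each thread increments its own private copy of count
 * and the copies are summed automatically at the end of the parallel loop. */
void count_sort_jloop_reduction(int a[], int n) {
  int *temp = malloc(n * sizeof(int));

  for (int i = 0; i < n; i++) {
    int count = 0;
    #pragma omp parallel for default(none) shared(a, i, n) reduction(+: count)
    for (int j = 0; j < n; j++) {
      if (a[j] < a[i] || (a[j] == a[i] && j < i))
        count++;                       /* private per thread */
    }
    temp[count] = a[i];
  }

  memcpy(a, temp, n * sizeof(int));
  free(temp);
}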


In [3]:
!pygmentize -l c -f html -O full -o ./jloop.html ../src/omp_jloop.c
HTML(open('jloop.html', 'r').read())


Out[3]:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifndef VECTOR_SIZE
#define VECTOR_SIZE 100
#endif

#ifdef _OPENMP
#include <omp.h>
#endif

void count_sort(int a[], int n) {
  int i, j, count;
  int *temp = malloc(n*sizeof(int));

  for (i = 0; i < n; i++) {
    count = 0;
    #pragma omp parallel for default(none) private(j) shared(count, i, n, a)
    for (j = 0; j < n; j++) {
      if (a[j] < a[i]) {
        #pragma omp atomic
        count++;                  /* count is shared: the increment must be atomic */
      } else if (a[j] == a[i] && j < i) {
        #pragma omp atomic
        count++;
      }
    }
    temp[count] = a[i];
  }

  #ifndef _OPENMP
  memcpy(a, temp, n*sizeof(int));
  #else

  #pragma omp parallel
  {
    int thread_count = omp_get_num_threads();
    int local_n = n / thread_count;

    #pragma omp parallel for default(none) private(i) shared(n, a, temp, local_n, thread_count)
    for (int i = 0; i < thread_count; i++) {
      int start = i * local_n;
      int nelem = local_n;
      if (i == thread_count - 1) {
        nelem = n - start;
      }
      memcpy(a + start, temp + start, nelem * sizeof(int));
    }
  }

  #endif
  free(temp);
} /* Count sort */


int init_vector(int vect[]) {
    for (int i = 0; i < VECTOR_SIZE; ++i ) {
      vect[i] = rand() % 100;
    }
    return 0;
}


int main(int argc, char** argv) {
  int vector[VECTOR_SIZE];

  init_vector(vector);

  count_sort(vector, VECTOR_SIZE);

  for(int i = 1; i < VECTOR_SIZE; i++) {
    if (vector[i] < vector[i - 1]) {
      return 1;
    }
  }

  return 0;
}

B

I decided to use $n = 10, 100, 1000, 10000, 100000$, and $1, 2, 4, 8, 10, 16, 20$ threads.

For the Xeon Phi I used $1, 2, 4, 8, 16, 32, 64, 128, 240$ threads.


In [4]:
VECTOR_SIZE = [10, 100, 1000, 10000, 100000]
THREADS = [1, 2, 4, 8, 10, 16, 20]
PHI_THREADS = [1, 2, 4, 8, 16, 32, 64, 128, 240]

qsort

This is my implementation using qsort.

To keep it consistent with the i-loop and j-loop OpenMP implementations (and so share the analysis code), I wrapped the qsort call in a separate function, $\text{qsort\_lib}$.


In [5]:
!pygmentize -l c -f html -O full -o ./qsort.html ../src/qsort.c
HTML(open('qsort.html', 'r').read())


Out[5]:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifndef VECTOR_SIZE
#define VECTOR_SIZE 100
#endif


int cmpfunc (const void * a, const void * b)
{
   return ( *(int*)a - *(int*)b );
}


void qsort_lib(int a[], int n) {

  qsort(a, n, sizeof(int), cmpfunc);

} /* stdlib qsort */


int init_vector(int vect[]) {
    for (int i = 0; i < VECTOR_SIZE; ++i ) {
      vect[i] = rand() % 100;
    }
    return 0;
}


int main(int argc, char** argv) {
  int vector[VECTOR_SIZE];

  init_vector(vector);

  qsort_lib(vector, VECTOR_SIZE);

  for(int i = 1; i < VECTOR_SIZE; i++) {
    if (vector[i] < vector[i - 1]) {
      return 1;
    }
  }

  return 0;
}

Preparing the executables and profiling

These commands build the executables, submit them to the HPCC job system, and finally extract the raw TAU profiles into text files. They are all implemented in the Makefile.


In [6]:
!cd .. && make exec/qsort-GNU
!cd .. && make workdir/qsort.pbs
!cd .. && make TAU_PROFILES_QSORT


make: Nothing to be done for `exec/qsort-GNU'.
make: Nothing to be done for `workdir/qsort.pbs'.
make: Nothing to be done for `TAU_PROFILES_QSORT'.

In [7]:
data = prepare_panel('qsort', threads=[1])
pn_qsort = pd.Panel4D(data)
# dimensions: pn[vector size, threads, function name, metric]
pn_qsort.ix[:,:,'qsort_lib','Inclusive usec/call']


Out[7]:
threads \ n    10    100   1000   10000   100000
1              16    122   2025   25396   142965

In [9]:
pn_qsort.ix[:,:,'qsort_lib','Inclusive usec/call'].T.plot(marker='o', logy=True, logx=True)
plt.xlabel('Vector size')
plt.ylabel('Microseconds')


[Figure: qsort_lib inclusive time per call vs. vector size, log-log axes]

i-loop


In [12]:
!cd .. && make exec/omp_iloop-GNU
!cd .. && make workdir/omp_iloop.pbs
!cd .. && make TAU_PROFILES_ILOOP


make: Nothing to be done for `exec/omp_iloop-GNU'.
make: Nothing to be done for `workdir/omp_iloop.pbs'.
make: Nothing to be done for `TAU_PROFILES_ILOOP'.

In [13]:
data = prepare_panel('omp_iloop')
pn_iloop = pd.Panel4D(data)
# dimensions: pn[vector size, threads, function name, metric]
pn_iloop.ix[:,:,'count_sort','Inclusive usec/call']


Out[13]:
threads \ n       10      100    1000    10000     100000
1                 13       81    6571   666366   64873847
2                129      172    3449   333217   33027419
4                218      280    2050   179163   17085422
8                409      482    1416    96269    9071035
10               510      526    1244    74052    7690740
16               855      804    1328    47208    4524382
20             60889    61554    2103   115002    3783039

In [14]:
pn_iloop.ix[:,:,'count_sort','Inclusive usec/call'].plot(marker='o', logx=True, logy=True)
plt.xlabel('Number of threads')
plt.ylabel('Microseconds')


[Figure: i-loop count_sort inclusive time per call vs. number of threads, one line per vector size, log-log axes]

Speedup

Using threads = 1 as base case
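
Speedup here is the single-thread time divided by the time with $p$ threads, computed separately for each vector size:

$$S(p) = \frac{T_{\text{count\_sort}}(1\ \text{thread})}{T_{\text{count\_sort}}(p\ \text{threads})}$$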


In [15]:
speedup_iloop = pn_iloop.ix[:,1,'count_sort','Inclusive usec/call'] / pn_iloop.ix[:,:,'count_sort','Inclusive usec/call']
speedup_iloop


Out[15]:
threads \ n             10            100       1000      10000    100000
1                        1              1          1          1         1
2                0.1007752      0.4709302    1.90519   1.999796  1.964242
4               0.05963303      0.2892857   3.205366   3.719328  3.797029
8               0.03178484      0.1680498   4.640537   6.921917  7.151758
10               0.0254902      0.1539924   5.282154   8.998623  8.435319
16              0.01520468      0.1007463   4.948042   14.11553  14.33872
20            0.0002135033    0.001315918   3.124584   5.794386  17.14861

In [16]:
speedup_iloop.plot()
plt.plot(THREADS, THREADS, '--', label='Linear speedup')
plt.legend(loc='best')
plt.xlabel('Number of threads')
plt.ylabel('Speedup')
plt.title("$i$-loop parallel for")


[Figure: i-loop speedup vs. number of threads (threads = 1 baseline), with a linear-speedup reference line]

As expected, for small vector sizes the synchronization overhead and the small workload per thread make the parallel function slower than the sequential case.

For bigger vector sizes there is a significant speedup, especially in the $n=100000$ case.

The outlier at $t=20$, $n=10000$ was probably caused by competition for resources with other processes running on the node (which is exactly why repeated runs with mean and standard deviation are needed).

Using qsort as base case
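
Here the baseline is the sequential qsort time for the same vector size, so values below 1 mean the parallel count sort is still slower than the library sort:

$$S_{\text{qsort}}(p) = \frac{T_{\text{qsort\_lib}}(1\ \text{thread})}{T_{\text{count\_sort}}(p\ \text{threads})}$$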


In [17]:
speedup_iloop_qsort = pn_qsort.ix[:,1,'qsort_lib','Inclusive usec/call'] / pn_iloop.ix[:,:,'count_sort','Inclusive usec/call']
speedup_iloop_qsort


Out[17]:
threads \ n             10           100       1000       10000       100000
1                 1.230769      1.506173  0.3081723  0.03811119  0.002203739
2                 0.124031     0.7093023  0.5871267   0.0762146  0.004328676
4                0.0733945     0.4357143  0.9878049    0.141748   0.00836766
8                0.0391198      0.253112   1.430085   0.2638025   0.01576061
10              0.03137255     0.2319392   1.627814   0.3429482   0.01858924
16              0.01871345     0.1517413   1.524849   0.5379597   0.03159879
20            0.0002627732      0.001982  0.9629101   0.2208309   0.03779105

In [18]:
speedup_iloop_qsort.plot()
plt.legend(loc='best')
plt.xlabel('Number of threads')
plt.ylabel('Speedup')
plt.title("$i$-loop parallel for")


[Figure: i-loop speedup vs. number of threads, sequential qsort baseline]

Using qsort as the base case we can see how a better algorithm matters more in the long run: since qsort is $O(n \log n)$ in the average case, even the parallelized $O(n^2)$ count sort is still slower for large vectors.

Despite that, for small vector sizes count_sort was actually faster than qsort, even in the sequential case.
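
Roughly, assuming ideal scaling, count sort with $p$ threads costs about $c_1 n^2 / p$ while qsort costs about $c_2\, n \log n$, so the parallel $O(n^2)$ code only wins while

$$p \gtrsim \frac{c_1}{c_2} \cdot \frac{n}{\log n},$$

and the number of threads needed grows almost linearly with $n$.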

j-loop


In [24]:
!cd .. && make exec/omp_jloop-GNU
!cd .. && make workdir/omp_jloop.pbs
!cd .. && make TAU_PROFILES_JLOOP


make: Nothing to be done for `TAU_PROFILES_JLOOP'.

In [26]:
data = prepare_panel('omp_jloop')
pn_jloop = pd.Panel4D(data)
# dimensions: pn[vector size, threads, function name, metric]
pn_jloop.ix[:,:,'count_sort','Inclusive usec/call']


Out[26]:
threads \ n      10     100    1000     10000      100000
1                14      89    7045    658315    66863486
2                95     437   11575    663478   103398272
4               158     516   16862   1308516   152685130
8               255     728   16512   1551742   101959000
10              297     798   22860   1480709   132550371
16              383    1079   23006   1413372   134742461
20              457  117537   26345   5533326   138720191

In [27]:
pn_jloop.ix[:,:,'count_sort','Inclusive usec/call'].plot(marker='o', logx=True, logy=True)
plt.xlabel('Number of threads')
plt.ylabel('Microseconds')


[Figure: j-loop count_sort inclusive time per call vs. number of threads, one line per vector size, log-log axes]

Speedup

Using threads = 1 as base case


In [28]:
speedup_jloop = pn_jloop.ix[:,1,'count_sort','Inclusive usec/call'] / pn_jloop.ix[:,:,'count_sort','Inclusive usec/call']
speedup_jloop


Out[28]:
threads \ n            10            100       1000      10000     100000
1                       1              1          1          1          1
2               0.1473684      0.2036613  0.6086393  0.9922183  0.6466596
4              0.08860759      0.1724806  0.4178033  0.5031005  0.4379175
8              0.05490196      0.1222527  0.4266594  0.4242426   0.655788
10             0.04713805      0.1115288  0.3081802  0.4445944  0.5044383
16             0.03655352     0.08248378  0.3062245  0.4657762  0.4962317
20             0.03063457   0.0007572084  0.2674132  0.1189727  0.4820026

In [29]:
speedup_jloop.plot()
plt.legend(loc='best')
plt.xlabel('Number of threads')
plt.ylabel('Speedup')
plt.title("$j$-loop parallel for")


[Figure: j-loop speedup vs. number of threads, threads = 1 baseline]

For the j-loop case adding more threads just made the function slower. This is expected: every update to $\text{count}$ has to be synchronized, so the threads spend most of their time waiting on each other instead of doing useful work.
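
To put a number on it: the condition holds exactly rank-of-$a[i]$ times per outer iteration, so over the whole sort the threads perform about

$$\frac{n^2}{2} = \frac{(10^5)^2}{2} = 5 \times 10^9$$

increments of the same shared variable for $n = 100000$. Every one of those updates touches the same memory location (and cache line), so they are effectively serialized no matter how many threads are used, and the parallel loop cannot beat the sequential one.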

Using qsort as base case


In [37]:
speedup_jloop_qsort = pn_qsort.ix[:,1,'qsort_lib','Inclusive usec/call'] / pn_jloop.ix[:,:,'count_sort','Inclusive usec/call']
speedup_jloop_qsort


Out[37]:
threads \ n           10        100        1000       10000       100000
1              0.8571429   1.576471   0.3314759  0.03836686  0.002292281
2             0.08510638  0.3059361   0.1791616  0.02760308  0.002624113
4             0.09022556  0.2696177  0.09060223  0.01911403   0.00076936
8             0.04562738  0.1654321   0.1081519   0.0137191   0.00086816
10            0.04669261  0.1398747   0.2314163  0.01711848  0.002791292
16            0.03252033  0.1392931  0.09790054  0.01741024  0.001071169
20            0.01355932  0.1247672  0.04817563  0.01558582  0.001039703

In [39]:
speedup_jloop_qsort.plot()
plt.legend(loc='best')
plt.xlabel('Number of threads')
plt.ylabel('Speedup')
plt.title("$i$-loop parallel for")


[Figure: j-loop speedup vs. number of threads, sequential qsort baseline]

Against the qsort baseline the picture is even worse: apart from the sequential $n=100$ case, the j-loop version never beats qsort, and adding threads only drives the ratio further down.

C

qsort

For the Xeon Phi implementation of qsort I had to declare cmpfunc as runnable on the MIC architecture (the __attribute__((target(mic))) annotation). The only other modification is the #pragma offload around the qsort call.


In [59]:
!pygmentize -l c -f html -O full -o ./mic_qsort.html ../src/mic_qsort.c
HTML(open('mic_qsort.html', 'r').read())


Out[59]:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifndef VECTOR_SIZE
#define VECTOR_SIZE 100
#endif


__attribute__((target(mic)))
int cmpfunc (const void * a, const void * b)
{
   return ( *(int*)a - *(int*)b );
}


void qsort_lib(int a[], int n) {

  #pragma offload target(mic) inout(a:length(n))
  qsort(a, n, sizeof(int), cmpfunc);

} /* stdlib qsort */


int init_vector(int vect[]) {
    for (int i = 0; i < VECTOR_SIZE; ++i ) {
      vect[i] = rand() % 100;
    }
    return 0;
}


int main(int argc, char** argv) {
  int vector[VECTOR_SIZE];

  init_vector(vector);

  qsort_lib(vector, VECTOR_SIZE);

  return 0;
}

i-loop

For the i-loop Xeon Phi implementation I wrapped the OpenMP parallel for in a #pragma offload block, sending $a$ to the coprocessor and bringing $\text{temp}$ back.


In [60]:
!pygmentize -l c -f html -O full -o ./mic_iloop.html ../src/mic_iloop.c
HTML(open('mic_iloop.html', 'r').read())


Out[60]:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifndef VECTOR_SIZE
#define VECTOR_SIZE 100
#endif

#ifdef _OPENMP
#include <omp.h>
#endif

void count_sort(int a[], int n) {
  int i, j, count;
  int *temp = malloc(n*sizeof(int));

  #pragma offload target(mic) in(a:length(n)) out(temp:length(n))
  {
  #pragma omp parallel for default(none) private(i, j, count) shared(a, temp, n)
  for (i = 0; i < n; i++) {
    count = 0;
    for (j = 0; j < n; j++) {
      if (a[j] < a[i])
        count++;
      else if (a[j] == a[i] && j < i)
        count++;
    }
    temp[count] = a[i];
  }
  }

  #ifndef _OPENMP
  memcpy(a, temp, n*sizeof(int));
  #else

  #pragma omp parallel
  {
    int thread_count = omp_get_num_threads();
    int local_n = n / thread_count;

    #pragma omp parallel for default(none) private(i) shared(n, a, temp, local_n, thread_count)
    for (int i = 0; i < thread_count; i++) {
      int start = i * local_n;
      int nelem = local_n;
      if (i == thread_count - 1) {
        nelem = n - start;
      }
      memcpy(a + start, temp + start, nelem * sizeof(int));
    }
  }

  #endif
  free(temp);
} /* Count sort */


int init_vector(int vect[]) {
    for (int i = 0; i < VECTOR_SIZE; ++i ) {
      vect[i] = rand() % 100;
    }
    return 0;
}


int main(int argc, char** argv) {
  int vector[VECTOR_SIZE];

  init_vector(vector);

  count_sort(vector, VECTOR_SIZE);

  return 0;
}

Speedup

Since I used TAU for the other analyses, I wanted to use it for the Xeon Phi cases too. But HPCC didn't have it installed at first, and after it was installed the profile data was not generated. I contacted HPCC support but didn't get an answer in time for the homework.

D

It should be compute-bound, but it ends up memory-bound because of the poor memory access pattern: for every element of the array the entire array is scanned again, so the same data is read $n$ times while doing only a comparison or two per element. More efficient sorting algorithms (like merge sort or even quicksort) touch far less data overall and have better access patterns than count sort.
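
To make that concrete: every outer iteration streams the whole array, so the total data read is about

$$4\ \text{bytes} \times n^2 = 4 \times (10^5)^2\ \text{bytes} \approx 40\ \text{GB}$$

for $n = 100000$, even though the array itself is only 400 KB. Whether those reads are served from cache or from main memory depends on how the threads share that 400 KB working set, but either way only a comparison or two is done per element loaded, so the loop's speed is limited by how fast data can be delivered rather than by arithmetic.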